import numpy as np
import sqlite3
import matplotlib.pylab as plt
import datetime as DT
import seaborn as sns
np.set_printoptions(precision=5)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#### import the PCA library from scikit learn library
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D # didn't get to use this
%matplotlib inline
# Normalizing (scaling) the data is VERY important - indeed can be important to many machine
# learning algorithms. Take the original features and scale them so that they all have zero
# mean and unit variance
from sklearn import preprocessing
# Open the SQLite database and preview its contents.
# pd.read_sql runs a query through the connection object and returns a
# pandas.DataFrame, so no cursor is needed here.
conn = sqlite3.connect('database.sqlite')

separator = '======'

# List every table recorded in sqlite_master (SQL keywords in caps by convention).
print(separator)
print('Tables in the database')
df_tables = pd.read_sql("""SELECT * FROM sqlite_master WHERE type='table';""", conn)
print('df_tables shape: ', df_tables.shape)
print(df_tables)

# Show the CREATE statements stored at rows 1 and 2 of the table listing.
# NOTE(review): assumes those rows are Player_Attributes and Player - confirm
# against the printed df_tables.
for heading, row in (('Player_Attributes table:', 1), ('Player table:', 2)):
    print(separator)
    print(heading)
    print(df_tables.sql[row])

# Load and preview the Player table.
print(separator)
print('Player table')
df_Player = pd.read_sql("""SELECT * FROM Player """, conn)
print('df_Player.shape:', df_Player.shape)
print(df_Player.columns)
print(df_Player.head())

# Load and preview the Player_Attributes table.
print(separator)
print('Player_Attributes table')
df_Player_Attributes = pd.read_sql("""SELECT * FROM Player_Attributes""", conn)
print('df_Player_Attributes.shape:', df_Player_Attributes.shape)
print(df_Player_Attributes.columns)
print(df_Player_Attributes.head())
print(separator)
# Acquire the working dataset with pd.read_sql_query: every column of Player
# joined to every column of Player_Attributes for records with a matching
# player_fifa_api_id. 'birthday' and 'date' are parsed as datetimes.
# NOTE(review): SELECT * over a join presumably returns the shared key columns
# once per table (duplicate labels) - confirm before selecting them by name.
sql = "SELECT * FROM Player INNER JOIN Player_Attributes ON Player.player_fifa_api_id=Player_Attributes.player_fifa_api_id;"
df_all_col = pd.read_sql_query(sql, conn, coerce_float=True, params=None,
                               parse_dates=['birthday', 'date'], chunksize=None)

# Age of the player, in completed years (stored as float), at the time the
# attributes were collected. Casting a timedelta to 'timedelta64[Y]' is not
# accepted by recent pandas releases, so the age is derived from the day count
# using the mean Gregorian year length.
days_per_year = 365.2425
elapsed = df_all_col['date'] - df_all_col['birthday']
df_all_col['age'] = np.floor(elapsed.dt.days / days_per_year)

# Total score per attribute category: each total is the plain sum of its
# member columns (a missing member value makes the total missing too).
attribute_categories = {
    'total_attack': ['crossing', 'finishing', 'heading_accuracy',
                     'short_passing', 'volleys'],
    'total_skill': ['dribbling', 'curve', 'free_kick_accuracy',
                    'long_passing', 'ball_control'],
    'total_movement': ['acceleration', 'sprint_speed', 'agility',
                       'reactions', 'balance'],
    'total_power': ['shot_power', 'jumping', 'stamina', 'strength',
                    'long_shots'],
    'total_mentality': ['aggression', 'interceptions', 'positioning',
                        'vision', 'penalties'],
    'total_defending': ['marking', 'standing_tackle', 'sliding_tackle'],
    'total_goalkeeping': ['gk_diving', 'gk_handling', 'gk_kicking',
                          'gk_positioning', 'gk_reflexes'],
}
for total_label, member_cols in attribute_categories.items():
    df_all_col[total_label] = sum(df_all_col[col] for col in member_cols)

print('df_all_col.columns: ', df_all_col.columns)  # column labels from both tables plus the derived ones
print('df_all_col.shape:', df_all_col.shape)
# DataFrame.info() writes its report itself and returns None, so it is not wrapped in print().
df_all_col.info()
# Column labels of interest, split into non-numeric and numeric groups.
non_numeric_col = [
    'player_fifa_api_id',
    'player_api_id',
    'player_name',
    'birthday',
    'date',
    'preferred_foot',
    'attacking_work_rate',
    'defensive_work_rate',
]

# Columns shared by both numeric lists: player profile and headline ratings.
_profile_cols = ['age', 'height', 'weight', 'overall_rating', 'potential']

# Individual attribute columns, grouped in the same order as the total_* scores.
_attribute_groups = [
    ['crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys'],
    ['dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control'],
    ['acceleration', 'sprint_speed', 'agility', 'reactions', 'balance'],
    ['shot_power', 'jumping', 'stamina', 'strength', 'long_shots'],
    ['aggression', 'interceptions', 'positioning', 'vision', 'penalties'],
    ['marking', 'standing_tackle', 'sliding_tackle'],
    ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'],
]

# Every numeric column: profile columns followed by each attribute group in turn.
numeric_col = list(_profile_cols)
for _group in _attribute_groups:
    numeric_col.extend(_group)

# Compact numeric view: profile columns plus the per-category totals.
numeric_few_col = _profile_cols + [
    'total_attack',
    'total_skill',
    'total_movement',
    'total_power',
    'total_mentality',
    'total_defending',
    'total_goalkeeping',
]
# Clean the joined data in place.
# Blank or whitespace-only text cells are treated as missing. The pattern is
# anchored on purpose: with a non-string replacement value, an unanchored
# r'\s+' matches any text that merely contains whitespace (for example a
# multi-word player name), so the whole cell becomes NaN and the dropna below
# would throw the row away.
df_all_col.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df_all_col.dropna(axis=0, how='any', inplace=True)  # drop every row (sample) with any NA entry
df_all_col.sort_values('player_name', axis=0, inplace=True)
df_all_col.drop_duplicates(inplace=True)
df_all_col.to_csv('df_all_col.csv')

# Preview the cleaned frame.
print ('df_all_col.shape: ', df_all_col.shape)
print(df_all_col.shape)
# DataFrame.info() writes its report itself and returns None, so it is not wrapped in print().
df_all_col.info()
print(df_all_col.head())
print(df_all_col.tail())
print(df_all_col['defensive_work_rate'][0:60])  # this column needs more cleaning before plotting
print ('======')
# Select the numeric feature columns and standardize them for PCA.
df_unscaled_data = df_all_col[numeric_col]
print('df_unscaled_data.columns:', df_unscaled_data.columns)
print('df_unscaled_data.shape:', df_unscaled_data.shape)
# DataFrame.info() writes its report itself and returns None; print the label
# first and call info() on its own so the label precedes the report and no
# stray "None" is printed.
print('df_unscaled_data.info: ')
df_unscaled_data.info()

scaled_data = preprocessing.scale(df_unscaled_data)  # center and scale the data
print('scaled data:')
print(scaled_data)  # preview scaled data
# Build the PCA model. sklearn's PCA object is fitted on one dataset and can
# then be applied to another.
pca = PCA()
print(type(pca))

# Fit on the scaled data (loading scores and the variation each component
# accounts for), then project the same data to get coordinates for the PCA graphs.
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

# explained_variance_ratio_ is a numpy.ndarray holding the share of variance
# per principal component; express it as a percentage with one decimal.
variance_share = pca.explained_variance_ratio_
per_var = np.round(variance_share * 100, decimals=1)

rule = '======================='
print(rule)
print('percent of explained variance: ')
print(per_var)

# Labels for the scree plot: PC1, PC2, ...
component_count = len(per_var)
PC_labels = ['PC' + str(number) for number in range(1, component_count + 1)]
print(' ')
print(rule)
# Scree plot: one bar per principal component, bar height = percentage of
# explained variance, ticks labelled PC1, PC2, ...
bar_positions = range(1, len(per_var) + 1)
plt.figure(figsize=(18, 6))
plt.bar(x=bar_positions, height=per_var, tick_label=PC_labels)
plt.title('Scree Plot', fontsize='18')
plt.xlabel('Principal Component', fontsize='14')
plt.ylabel('Percentage of Explained Variance', fontsize='14')
plt.show()
plt.close()
# Wrap the projected data in a DataFrame whose columns carry the PC labels.
pca_df = pd.DataFrame(data=pca_data, index=None, columns=PC_labels)
print(pca_df.head())  # preview the transformed, scaled data
print('=======================')
print('Principal Components Scatter Matrix')

# Scatter matrix restricted to the first twenty components: PC1, PC2, ..., PC20.
leading_components = ['PC' + str(number) for number in range(1, 21)]
df_pc_matrix = pca_df[leading_components]
pd.plotting.scatter_matrix(
    df_pc_matrix,
    alpha=0.1,
    figsize=(14, 14),
    diagonal='kde',
    range_padding=0.1,
)
plt.tight_layout()
plt.show()
plt.close()
# Loading scores of the first three principal components, indexed by feature name.
loading_scores_PC1 = pd.Series(pca.components_[0], index=numeric_col)
loading_scores_PC2 = pd.Series(pca.components_[1], index=numeric_col)
loading_scores_PC3 = pd.Series(pca.components_[2], index=numeric_col)

# Rank the features of each component by absolute loading, largest first.
loading_scores_PC1_sorted = loading_scores_PC1.abs().sort_values(ascending=False)
loading_scores_PC2_sorted = loading_scores_PC2.abs().sort_values(ascending=False)
loading_scores_PC3_sorted = loading_scores_PC3.abs().sort_values(ascending=False)

# Report the rankings, with a divider line between consecutive components.
ranked_loadings = (
    loading_scores_PC1_sorted,
    loading_scores_PC2_sorted,
    loading_scores_PC3_sorted,
)
for component_number, ranking in enumerate(ranked_loadings, start=1):
    if component_number > 1:
        print('=======')
    tag = 'PC' + str(component_number)
    print('Sorted ' + tag + ' Loading Scores (abs)')
    print(tag + ' sorted components: ', ranking.index)
    print(ranking)
# Note: plotting PC1 versus PC2 and PC1 versus PC3 each reveals two clusters.
# Next, we visualize these clusters further using PC1, PC2 and PC3.
# Draw the 2D PCA plots: PC1 vs PC2 and PC1 vs PC3.
def color_plot(i):
    """Show PC1-vs-PC2 and PC1-vs-PC3 scatter plots colored by one feature.

    Parameters
    ----------
    i : int
        Position of the feature in ``numeric_col``; the same position selects
        the column of ``scaled_data`` used to color the points.

    Reads the module-level ``pca_df``, ``scaled_data``, ``numeric_col`` and
    ``per_var``. Displays one figure with two side-by-side panels and closes it.
    """
    feature = numeric_col[i]
    point_colors = scaled_data[:, i]

    plt.figure(figsize=(10, 5))
    # Left panel plots PC2 on the y axis, right panel PC3; both share PC1 on x.
    for panel, pc_number in enumerate((2, 3), start=1):
        pc_name = 'PC' + str(pc_number)
        plt.subplot(1, 2, panel)
        plt.scatter(pca_df.PC1, pca_df[pc_name], c=point_colors, alpha=0.1)
        plt.title("PCA Graph: PC1 Versus " + pc_name + " - color by " + feature, fontsize='12')
        plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize='12')
        plt.ylabel('{0} - {1}%'.format(pc_name, per_var[pc_number - 1]), fontsize='12')
        plt.tight_layout()
    plt.show()
    plt.close()


# One figure per numeric feature; the bound follows numeric_col instead of a
# hard-coded count so the loop stays correct if the feature list changes.
for j in range(len(numeric_col)):
    color_plot(j)
#'attacking_work_rate', 'defensive_work_rate']
#plt.title('players with gk_diving > 50 (goalkeepers)')
# Scatter plot of ball control against marking (per the original author's
# note, the highest loading scores in PC1 and PC2 respectively).
def plot(df_all, df_sub, hue_col):
    """Draw ball_control-vs-marking scatter plots for two data sets.

    Parameters
    ----------
    df_all : DataFrame shown first, titled 'all players'.
    df_sub : DataFrame shown second, titled as the goalkeeper subset.
    hue_col : column name used to color the points, or None for no coloring.

    Each plot is shown on fixed 0-100 axes and then closed.
    """
    panels = (
        (df_all, 'all players'),
        (df_sub, 'players with gk_diving > 50 (goalkeepers)'),
    )
    for frame, caption in panels:
        sns.lmplot(x='ball_control', y='marking', hue=hue_col, sharex=False,
                   data=frame, scatter=True, fit_reg=False, units=None,
                   order=1, legend=True)
        plt.title(caption)
        plt.xlim(0, 100)
        plt.ylim(0, 100)
        plt.show()
        plt.close()
print('gk_diving > 50 (goalkeepers)')
# Goalkeeper subset: rows whose gk_diving score exceeds 50.
df_goalkeepers = df_all_col[df_all_col['gk_diving'] > 50]

# No coloring, then colored by preferred foot (lefty / righty).
plot(df_all_col, df_goalkeepers, None)
plot(df_all_col, df_goalkeepers, 'preferred_foot')

# Left-footed players only.
print('plot preferred left foot')
df1 = df_all_col[df_all_col['preferred_foot'] == 'left']
df2 = df_goalkeepers[df_goalkeepers['preferred_foot'] == 'left']
plot(df1, df2, None)

# Colored by 'attacking_work_rate', keeping only rows with one of the three
# expected labels.
known_work_rates = ['low', 'medium', 'high']
df3 = df_all_col[df_all_col['attacking_work_rate'].isin(known_work_rates)]
df4 = df_goalkeepers[df_goalkeepers['attacking_work_rate'].isin(known_work_rates)]
plot(df3, df4, 'attacking_work_rate')
# Joint plots against the goalkeeper attributes:
# 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'
def joint_plot(df, title):
    """Show a joint plot of each outfield attribute against each goalkeeper attribute.

    Parameters
    ----------
    df : DataFrame holding the attribute columns to plot.
    title : text placed at the left of each plot's title.

    Produces ten plots (ball_control, then marking, each against the five
    goalkeeper attributes) on fixed 0-100 axes, showing and closing each one.
    """
    # The original also carried disabled 'dribbling' panels; add 'dribbling'
    # to this tuple to restore them.
    outfield_attributes = ('ball_control', 'marking')
    gk_attributes = ('gk_diving', 'gk_handling', 'gk_kicking',
                     'gk_positioning', 'gk_reflexes')

    for x_col in outfield_attributes:
        for y_col in gk_attributes:
            # NOTE(review): stat_func is kept from the original call; later
            # seaborn releases presumably no longer accept it - confirm
            # against the installed version.
            sns.jointplot(x=x_col, y=y_col, data=df, xlim=(0, 100),
                          ylim=(0, 100), stat_func=None)
            plt.title(title, loc='left')
            plt.show()
            plt.close()


# df_goalkeepers=df_all_col.loc[df_all_col['gk_diving']>40]
joint_plot(df_all_col, 'all players')
joint_plot(df_goalkeepers, 'gk_diving > 50 (goalkeepers)')
import seaborn as sns

# Subsets by defensive work rate: all three valid labels, then each label alone.
defensive_rate = df_all_col['defensive_work_rate']
df1 = df_all_col[defensive_rate.isin(['low', 'medium', 'high'])]
df2 = df_all_col[defensive_rate.isin(['high'])]
df3 = df_all_col[defensive_rate.isin(['medium'])]
df4 = df_all_col[defensive_rate.isin(['low'])]


def lmplot(df):
    """Plot four defensive attributes against overall_rating, colored by defensive_work_rate.

    Parameters
    ----------
    df : DataFrame providing the attribute, overall_rating and
        defensive_work_rate columns.

    Shows one scatter plot (no regression line) per attribute: marking,
    standing_tackle, sliding_tackle and interceptions.
    """
    defensive_attributes = ('marking', 'standing_tackle', 'sliding_tackle',
                            'interceptions')
    for x_col in defensive_attributes:
        sns.lmplot(x=x_col, y='overall_rating', hue='defensive_work_rate',
                   sharex=False, data=df, scatter=True, fit_reg=False,
                   units=None, order=1, legend=True)
        plt.title('Colored By Defensive Work Rate')
        plt.show()
        plt.close()


# Plot order: low/medium/high together, then high only, medium only, low only.
for work_rate_subset in (df1, df2, df3, df4):
    lmplot(work_rate_subset)
# Scatter matrix of the profile columns and per-category totals.
df_totals = df_all_col[numeric_few_col]
pd.plotting.scatter_matrix(
    df_totals,
    alpha=0.1,
    figsize=(16, 16),
    diagonal='kde',
    range_padding=0.01,
)
plt.tight_layout()
plt.show()
plt.close()

# Save the same columns, plus the player identifier and name, to CSV.
total_cols = [*numeric_few_col, 'player_fifa_api_id', 'player_name']
df_t = df_all_col[total_cols]
df_t.to_csv('player_total_score_per_attributes_category.csv')
print(df_t.shape)
print(df_t.head())
def _rating_lmplot(x_col, frame, caption, fit_line):
    """Show x_col against overall_rating for frame and return the seaborn grid."""
    grid = sns.lmplot(x=x_col, y='overall_rating', hue=None, sharex=False,
                      data=frame, scatter=True, fit_reg=fit_line, units=None,
                      order=1, legend=True)
    plt.title(caption)
    plt.show()
    plt.close()
    return grid


# Split on the total goalkeeping score.
# NOTE(review): a total of exactly 250 falls in neither subset - confirm this is intended.
df_totals_gk = df_totals[df_totals['total_goalkeeping'] > 250]
df_totals_non_gk = df_totals[df_totals['total_goalkeeping'] < 250]

# Total Goalkeeping
vis = _rating_lmplot('total_goalkeeping', df_totals,
                     'Total Goalkeeping Versus Overall Rating', False)
corr_gk = df_totals_gk[['total_goalkeeping', 'overall_rating']].corr()
print('A Closer Look at the Goalkeeper Subgroup on the Far Right')
print(' total_goalkeeping > 250 ')
print(' ')
print('Strong Positive Linear Correlation: ')
print(corr_gk)
vis = _rating_lmplot('total_goalkeeping', df_totals_gk,
                     'Goalkeeper Subgroup: Total Goalkeeping Versus Overall Rating', True)

# Total Mentality
vis = _rating_lmplot('total_mentality', df_totals,
                     'Total Mentality Versus Overall Rating', False)
corr_mentality = df_totals_non_gk[['total_mentality', 'overall_rating']].corr()
print('A Closer Look at the Non_goalkeeper Subgroup on the Far Right')
print(' total_goalkeeping < 250')
print(' ')
print('Moderate Positive Linear Correlation: ')
print(corr_mentality)
vis = _rating_lmplot('total_mentality', df_totals_non_gk,
                     'Non_Goalkeeper Subgroup: Total Mentality Versus Overall Rating', True)

# Total Attack
vis = _rating_lmplot('total_attack', df_totals,
                     'Total Attack Versus Overall Rating', False)
corr_attack = df_totals_non_gk[['total_attack', 'overall_rating']].corr()
print('A Closer Look at the Non_goalkeeper Subgroup on the Far Right')
print(' total_goalkeeping < 250')
print(' ')
print('Moderate Positive Linear Correlation: ')
print(corr_attack)
vis = _rating_lmplot('total_attack', df_totals_non_gk,
                     'Non_Goalkeeper Subgroup: Total Attack Versus Overall Rating', True)
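# Rule of thumb for interpreting a correlation coefficient r (the negative
# scale is listed; positive values of the same magnitude indicate uphill
# relationships of the same strength):
# • Exactly –1. A perfect downhill (negative) linear relationship
# • –0.70. A strong downhill (negative) linear relationship
# • –0.50. A moderate downhill (negative) linear relationship
# • –0.30. A weak downhill (negative) linear relationship
# • 0. No linear relationship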
# Save correlation coefficients to CSV: whole dataset, goalkeepers, non-goalkeepers.
# NOTE(review): rows with gk_diving exactly 50 fall in neither subset - confirm this is intended.
gk_diving_score = df_unscaled_data['gk_diving']
df_gk = df_unscaled_data[gk_diving_score > 50]
df_non_gk = df_unscaled_data[gk_diving_score < 50]

df_corr = df_unscaled_data.corr()
df_gk_corr = df_gk.corr()
df_non_gk_corr = df_non_gk.corr()

df_corr.to_csv('df_corr.csv')
df_gk_corr.to_csv('df_gk_corr.csv')
df_non_gk_corr.to_csv('df_non_gk_corr.csv')
# Two scatter matrices of the unscaled features, taken in order of their
# absolute PC1 loading: ranks 0-18 first, then ranks 19-37.
rank_windows = ((0, 19), (19, 38))
for matrix_number, (first, stop) in enumerate(rank_windows, start=1):
    print('Unscaled Data Scatter Matrix ' + str(matrix_number))
    print('PC1 sorted components: ', loading_scores_PC1_sorted.index[first:stop])
    col_of_interest = loading_scores_PC1_sorted.index[first:stop]
    df_col_of_interest = df_unscaled_data[col_of_interest]
    pd.plotting.scatter_matrix(
        df_col_of_interest,
        alpha=0.1,
        figsize=(16, 16),
        diagonal='kde',
        range_padding=0.01,
    )
    plt.tight_layout()
    plt.show()
    plt.close()
# Final column set used for the distribution plots: the player identifier and
# three categorical columns, followed by every numeric column.
leading_cols = ['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate',
                'defensive_work_rate']
final_col = leading_cols + numeric_col
print(final_col)

df_final = df_all_col[final_col]
print(df_final.head())
print(len(final_col))
df_final.to_csv("df_final.csv")
# Distribution plots for the goalkeeping attributes on a 3x2 grid (five cells used).
gk_columns = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
              'gk_reflexes']
fig = plt.figure(figsize=(24, 18))
ax1, ax2, ax3, ax4, ax5 = [fig.add_subplot(3, 2, cell) for cell in range(1, 6)]
vis1, vis2, vis3, vis4, vis5 = [
    sns.distplot(df_all_col[column], bins=30, ax=axis)
    for column, axis in zip(gk_columns, (ax1, ax2, ax3, ax4, ax5))
]
plt.show()
plt.close()

# Distribution plots for the defending attributes on a 1x3 grid.
defending_columns = ['marking', 'standing_tackle', 'sliding_tackle']
fig = plt.figure(figsize=(18, 5))
ax6, ax7, ax8 = [fig.add_subplot(1, 3, cell) for cell in range(1, 4)]
vis6, vis7, vis8 = [
    sns.distplot(df_all_col[column], bins=30, ax=axis)
    for column, axis in zip(defending_columns, (ax6, ax7, ax8))
]
plt.show()
plt.close()
# One distribution plot per numeric feature. Iterating numeric_col directly
# covers the same columns as the former fixed index window (4..41) into
# final_col, without depending on the list lengths staying at 4 + 38.
for column in numeric_col:
    sns.distplot(df_all_col[column], bins=30)
    plt.title('Distribution Plot')
    plt.show()
    plt.close()
####
def print_corr(x_attribute, y_attribute, df, title):
    """Print the correlation matrix of two columns and plot them with a fitted line.

    Parameters
    ----------
    x_attribute : column name plotted on the x axis.
    y_attribute : column name plotted on the y axis.
    df : DataFrame containing both columns.
    title : heading printed above the correlation matrix.
    """
    pair = [x_attribute, y_attribute]
    correlation = df[pair].corr()
    print(title)
    print(correlation)

    sns.lmplot(x=x_attribute, y=y_attribute, hue=None, sharex=False, data=df,
               scatter=True, fit_reg=True, units=None, order=1, legend=True)
    plt.show()
    plt.close()
###
print('A Closer Look at Overall Rating Versus Reactions Attributes')
print(' ')
print_corr('reactions', 'overall_rating', df_all_col, 'All Players')

rating_pair = ['reactions', 'overall_rating']
work_rate_labels = ['low', 'medium', 'high']
# Keyword arguments shared by both colored regression plots below.
fit_options = dict(sharex=False, scatter=True, fit_reg=True, units=None,
                   order=1, legend=True)

# Colored by attacking work rate (rows with one of the three expected labels only).
df11 = df_all_col[df_all_col['attacking_work_rate'].isin(work_rate_labels)]
corr11 = df11[rating_pair].corr()
print(corr11)
vis11 = sns.lmplot(x='reactions', y='overall_rating',
                   hue='attacking_work_rate', data=df11, **fit_options)
plt.show()

# Colored by defensive work rate (same label filter).
df12 = df_all_col[df_all_col['defensive_work_rate'].isin(work_rate_labels)]
corr12 = df12[rating_pair].corr()
print(corr12)
vis12 = sns.lmplot(x='reactions', y='overall_rating',
                   hue='defensive_work_rate', data=df12, **fit_options)
plt.show()
# Subgroups by gk_diving score.
# NOTE(review): rows with gk_diving exactly 50 fall in neither subgroup - confirm this is intended.
gk_diving_values = df_all_col['gk_diving']
df_goalkeepers = df_all_col[gk_diving_values > 50]      # goalkeepers
df_non_goalkeepers = df_all_col[gk_diving_values < 50]  # non_goalkeepers

print('#######')
print(' ')
print('Non_Goalkeeper Subgroup')
print(' ')
print_corr('reactions', 'overall_rating', df_non_goalkeepers,
           'Non_Goalkeepers: gk_diving < 50')

print('#######')
print(' ')
print('Goalkeeper Subgroup')
print(' ')
# Attributes in the order the original author ranked them, strongest
# correlation with overall_rating first.
goalkeeper_attributes = ('gk_diving', 'gk_reflexes', 'gk_positioning',
                         'gk_handling', 'gk_kicking', 'reactions')
for attribute in goalkeeper_attributes:
    print_corr(attribute, 'overall_rating', df_goalkeepers,
               'Goalkeepers Only: gk_diving > 50')